# SET DIRECTORIES
setwd("/path/to/SOEPdata/folder") # set directory of SOEP data folder
rep_path <- "/path/to/replication/folder" # define folder of replication

# LOAD PACKAGES
pacman::p_load(plyr, dplyr, stringr, rddtools, rdrobust, rddensity, 
               scales, foreign, haven, purrr, pbmcapply, devtools, 
               tidyr, ggplot2, lubridate, ggthemes, outreg, stargazer,  
               gtable, grid, gridExtra, cowplot, tidyquant, jtools, 
               modelsummary, ggsci)

##############################################

# PHRF
meta <- c('pid')
vars <- c('bcphrf', 'bdphrf', 'bephrf', 'bdpbleib', 'bepbleib')
df_weight <- read_dta(file = "raw/phrf.dta", col_select = all_of(c(meta, vars))) %>%
  mutate(weight1214 = bcphrf*bdpbleib*bepbleib, 
         weight12 = bcphrf, weight13 = bdphrf, weight14 = bephrf) %>%
  select(pid, weight1214, weight12, weight13, weight14)

# PPATHL
meta <- c('pid', 'syear')
vars <- c('sex', 'gebjahr')
df_demo <- read_dta(file = "ppathl.dta", col_select = all_of(c(meta, vars))) %>%
  mutate_at(vars(vars), funs(ifelse(. < 0, NA, .))) %>%
  mutate(male = (sex == 1) %>% as.numeric(), female = (sex == 2) %>% as.numeric(), age = c(syear-gebjahr)) %>%
  mutate_at(vars('age'), funs(ifelse(. < 0, 0, .))) %>%
  mutate(ag = cut(age, c(0, 25, 35, 45, 55, 65, 75), include.lowest = T) %>% as.character() %>% gsub('[^0-9]', '', .)) %>%
  fastDummies::dummy_cols(., select_columns = c('ag'), ignore_na = T) %>%
  select(pid, syear, gebjahr, age, male, female, ag_025, ag_2535, ag_3545, ag_4555, ag_5565, ag_6575)

###########

# PGEN
meta <- c('pid', 'syear')
vars <- c('pgfamstd', 'pgemplst', 'pgisced97', 'pgcasmin', 'pgmonth', 'pgbilzeit', 
          'pgexpft', 'pgexppt', 'pgpsbil', 'pgerwzeit', 'pgtatzeit', 'pgvebzeit', 
          'pguebstd', 'pgpartnr')
df_pgen <- read_dta(file = "pgen.dta", col_select = all_of(c(meta, vars))) %>%
  mutate_at(vars(vars), funs(ifelse(. < 0, NA, .))) %>%
  mutate(maritalCat = pgfamstd, labCat = pgemplst, educCat = pgisced97, schoolCat = pgpsbil) %>%
  fastDummies::dummy_cols(., select_columns = c('maritalCat', 'labCat', 'educCat', 'schoolCat'), ignore_na = T) %>%
  mutate(married = pgfamstd %>% replace(., !. %in% c(1) & !is.na(.), 0) %>% replace(., . > 0, 1)) %>%
  mutate(fulltime = pgemplst %>% replace(., !. %in% c(1) & !is.na(.), 0) %>% replace(., . > 0, 1),
         parttime = pgemplst %>% replace(., !. %in% c(2) & !is.na(.), 0) %>% replace(., . > 0, 1),
         apprentice = pgemplst %>% replace(., !. %in% c(3) & !is.na(.), 0) %>% replace(., . > 0, 1),
         irregwork = pgemplst %>% replace(., !. %in% c(4) & !is.na(.), 0) %>% replace(., . > 0, 1),
         noempl = pgemplst %>% replace(., !. %in% c(5) & !is.na(.), 0) %>% replace(., . > 0, 1),
         disabled = pgemplst %>% replace(., !. %in% c(6) & !is.na(.), 0) %>% replace(., . > 0, 1)) %>%
  mutate(HigherVocEdu = pgisced97 %>% replace(., !. %in% c(4:6) & !is.na(.), 0) %>% replace(., . > 0, 1)) %>%
  mutate(eduyrs = pgbilzeit, ftexperience = pgexpft, ptexperience = pgexppt) %>%
  mutate(ppid = pgpartnr) %>%
  mutate(INTYEAR = syear, INTMONTH = str_pad(pgmonth, width = 2, pad = '0')) %>%
  mutate(INTDATE = paste(INTYEAR, INTMONTH, '01', sep = '-') %>% as.Date()) %>%
  mutate(INTWEEK = week(INTDATE)) %>%
  select(pid, syear, INTDATE, INTYEAR, INTMONTH, INTWEEK, maritalCat, labCat, educCat, everything(.)) %>%
  select(-all_of(vars))

###########

# HGEN
meta <- c('hid', 'syear')
vars <- c('hghinc')
df_hgen <- read_dta(file = "hgen.dta", col_select = all_of(c(meta, vars))) %>%
  mutate_at(vars(vars), funs(ifelse(. < 0, NA, .))) %>%
  mutate(income = hghinc) %>%
  select(hid, syear, income, everything(.)) %>%
  select(-all_of(vars))

###########

# BIOBIRTH
meta <- c('pid')
vars <- c('bioyear', 'kidgeb01', 'kidmon01', 'kidgeb02', 'kidmon02', 'kidgeb03', 'kidmon03')
df_biobirth <- read_dta(file = "biobirth.dta", col_select = all_of(c(meta, vars))) %>%
  mutate_at(vars(vars), funs(ifelse(. < 0, NA, .))) %>%
  mutate(BYEAR01 = kidgeb01, BMONTH01 = str_pad(as.numeric(kidmon01), 2, pad = '0')) %>%
  mutate(BDATE01 = paste(BYEAR01, BMONTH01, '01', sep = '-') %>% replace(., grepl('NA', .), NA) %>% as.Date(.)) %>%
  mutate(BYEAR02 = kidgeb02, BMONTH02 = str_pad(as.numeric(kidmon02), 2, pad = '0')) %>%
  mutate(BDATE02 = paste(BYEAR02, BMONTH02, '01', sep = '-') %>% replace(., grepl('NA', .), NA) %>% as.Date(.)) %>%
  mutate(BYEAR03 = kidgeb03, BMONTH03 = str_pad(as.numeric(kidmon03), 2, pad = '0')) %>%
  mutate(BDATE03 = paste(BYEAR03, BMONTH03, '01', sep = '-') %>% replace(., grepl('NA', .), NA) %>% as.Date(.)) %>%
  subset(., !is.na(BDATE01)) %>%
  mutate(BDATE01str = as.character(BDATE01), BDATE01num = (BDATE01 - as.Date('1900-01-01')) %>% as.numeric()) %>%
  mutate(BDATE02str = as.character(BDATE02), BDATE02num = (BDATE02 - as.Date('1900-01-01')) %>% as.numeric()) %>%
  mutate(BDATE03str = as.character(BDATE03), BDATE03num = (BDATE03 - as.Date('1900-01-01')) %>% as.numeric()) %>%
  select(pid, bioyear, BDATE01, BDATE01num, BDATE01str, BDATE02, BDATE02num, BDATE02str, BDATE03, BDATE03num, BDATE03str)

# GET NUMBER OF CHILDREN
df_nchild <- df_biobirth %>%
  select(pid, BDATE01, BDATE02, BDATE03) %>%
  gather(., 'date', 'val', -pid) %>%
  subset(., !is.na(val)) %>%
  group_by(pid) %>% summarize(nchild = n()) %>% ungroup()

###########

# PL
meta <- c('pid', 'hid', 'syear')
vars <- c('pmonin', 'plh0013_h', 'plh0012_h', 'plh0333', 'plh0004', 'plh0175', 'plh0007', 
          'plh0011_h', 'plh0111', 'plh0335', 'plc0311', 'plh0216', 'plb0021',
          'plb0037_h', 'plb0183_h', 'plb0193', 'plh0244', 'plh0155', 'plh0162',
          'plh0176', 'plh0173', 'plh0182', 'plh0180', 'plh0188', 'plh0344',
          'plh0042', 'plj0046', 'plh0035', 'plh0033', 'plh0263_h', 'plc0014_h', 
          'plc0342')
df_pl <- read_dta(file = "pl.dta", col_select = all_of(c(meta, vars))) %>%
  mutate_at(vars(vars), funs(ifelse(. < 0, NA, .))) %>%
  mutate(partysupport = plh0012_h) %>%
  mutate(partySPD = case_when(plh0012_h %in% c(1, 9, 10, 11, 12, 17, 31) ~ 1, TRUE ~ 0) %>% replace(., is.na(plh0012_h), NA),
         partyUNION = case_when(plh0012_h %in% c(2, 10, 13, 14, 15, 20, 21, 30, 3, 13, 22, 30) ~ 1, TRUE ~ 0) %>% replace(., is.na(plh0012_h), NA),
         partyFDP = case_when(plh0012_h %in% c(4, 11, 14, 22, 23, 24, 25) ~ 1, TRUE ~ 0) %>% replace(., is.na(plh0012_h), NA),
         partyGRUNE = case_when(plh0012_h %in% c(5, 9, 15, 16, 18, 23) ~ 1, TRUE ~ 0) %>% replace(., is.na(plh0012_h), NA),
         partyLINKE = case_when(plh0012_h %in% c(6, 16, 17, 19, 20, 24) ~ 1, TRUE ~ 0) %>% replace(., is.na(plh0012_h), NA),
         partyAFD = case_when(plh0012_h %in% c(27, 30, 31) ~ 1, TRUE ~ 0) %>% replace(., is.na(plh0012_h), NA)) %>%
  mutate(partident = plh0011_h %>% replace(., . == 2, 0) %>% replace(., . == 3, NA)) %>%
  mutate(partyintens = plh0013_h) %>%
  mutate(worrypensDummy = plh0335 %>% replace(., .<2, 1) %>% replace(., . >= 2, 0)) %>%
  mutate(retired = plc0311 %>% replace(., . == 2, 0),
         unemployed = plb0021 %>% replace(., . == 2, 0),
         unlimited = plb0037_h %>% replace(., . %in% c(2, 3, 4), 0),
         limited = plb0037_h %>% replace(., . %in% c(1, 3, 4), 0),
         selfemp = plb0037_h %>% replace(., . %in% c(1, 2, 3), 0),
         income2 = plc0014_h) %>%
  mutate(lrscale = plh0004, polintrest = plh0007, satishhinc = plh0175, polactive = plh0111) %>%
  mutate(INTDATE = paste(syear, str_pad(pmonin, 2, pad = '0'), '01', sep = '-') %>% as.Date()) %>%
  select(all_of(c('pid', 'hid', 'syear', 
                  'partysupport', "partySPD", "partyUNION", "partyFDP", "partyGRUNE", "partyLINKE", "partyAFD",
                  'partyintens', "lrscale", 'partident', "polintrest", 'polactive', "satishhinc", 
                  'retired', 'unemployed', 'unlimited', 'limited', 'selfemp', 'income2')))

###

# PL - CROSS-SECTION
meta <- c('pid', 'hid', 'syear')
vars <- c('pli0097_h', 'plh0392')
df_pl_cs <- read_dta(file = "pl.dta", col_select = all_of(c(meta, vars))) %>%
  mutate_at(vars(vars), funs(ifelse(. < 0, NA, .))) %>%
  mutate(politmember = as.numeric(pli0097_h != 5)) %>%
  select(all_of(c('pid', 'politmember'))) %>%
  group_by(pid) %>% summarize_all(mean, na.rm = T) %>% ungroup() %>%
  mutate(politmember = as.numeric(politmember > 0))

###

# PL - OTHER POLICY PREFERENCES
meta <- c('pid', 'hid', 'syear')
vars <- c('plh0040', 'plh0039', 'plh0037', 'plh0268', 'plh0035', 'plh0033', 'plj0046', 'plh0022', 'plh0021', 
          'plh0020', 'plh0026', 'plh0016', 'plh0017', 'plh0018', 'plh0019')
df_pl_cs_otherOpinion <- read_dta(file = "pl.dta", col_select = all_of(c(meta, vars))) %>%
  subset(., syear >= 2012 & syear <= 2017) %>%
  mutate_at(vars(vars), funs(ifelse(. < 0, NA, .))) %>%
  mutate(OTHgovtasksick = plh0022, OTHgovtaskfinsick = plh0021, OTHgovtaskunemployed = plh0019, OTHgovtaskjobs = plh0020,
         OTHworrycrime = plh0040, OTHworryterrorism = plh0039, OTHworryeuro = plh0268, OTHworryimmigration = plj0046) %>%
  group_by(pid) %>% summarize_at(vars(contains('OTH')), mean, na.rm = T) %>% ungroup() %>%
  select(all_of(c('pid')), contains('OTH'))

###

# PARTY AFFILIATION OF OTHER HOUSEHOLD MEMBERS
meta <- c('pid', 'hid', 'syear')
vars <- c('plh0012_h')
df_pl_hhmem <- read_dta(file = "pl.dta", col_select = all_of(c(meta, vars))) %>%
  subset(., pid %in% c(df_demo %>% subset(., female == 0) %>% pull(pid) %>% unique())) %>%
  subset(., pid %in% c(df_pgen %>% pull(ppid) %>% unique())) %>%
  subset(., syear %in% c(2012:2014)) %>%
  group_by(hid, syear) %>% mutate(nsize = n()) %>% ungroup() %>% subset(., nsize == 1) %>%
  group_by(pid) %>% mutate(n_pid = n()) %>% ungroup() %>% subset(., n_pid >= 2) %>%
  mutate_at(vars(vars), funs(ifelse(. < 0, NA, .))) %>%
  mutate(partysupport = plh0012_h) %>%
  mutate(HHpartySPD = case_when(plh0012_h %in% c(1, 9, 10, 11, 12, 17, 31) ~ 1, TRUE ~ 0) %>% replace(., is.na(plh0012_h), NA),
         HHpartyUNION = case_when(plh0012_h %in% c(2, 10, 13, 14, 15, 20, 21, 30, 3, 13, 22, 30) ~ 1, TRUE ~ 0) %>% replace(., is.na(plh0012_h), NA),
         HHpartyFDP = case_when(plh0012_h %in% c(4, 11, 14, 22, 23, 24, 25) ~ 1, TRUE ~ 0) %>% replace(., is.na(plh0012_h), NA),
         HHpartyGRUNE = case_when(plh0012_h %in% c(5, 9, 15, 16, 18, 23) ~ 1, TRUE ~ 0) %>% replace(., is.na(plh0012_h), NA),
         HHpartyLINKE = case_when(plh0012_h %in% c(6, 16, 17, 19, 20, 24) ~ 1, TRUE ~ 0) %>% replace(., is.na(plh0012_h), NA),
         HHpartyAFD = case_when(plh0012_h %in% c(27, 30, 31) ~ 1, TRUE ~ 0) %>% replace(., is.na(plh0012_h), NA)) %>%
  select(all_of(c('hid', 'syear')), contains('HHparty')) %>%
  group_by(hid, syear) %>% summarize_at(vars(contains('HHparty')), mean, na.rm = T) %>% ungroup() %>%
  mutate_at(vars(contains('HHparty')), ~as.numeric(. > 0))

###

# BIP: TURNOUT 2013
meta <- c('pid', 'syear')
vars <- c('bep121')
df_turnout13 <- read_dta(file = "raw/bep.dta", col_select = all_of(c(meta, vars))) %>%
  mutate_at(vars(vars), funs(ifelse(. < 0, NA, .))) %>%
  mutate(turnout13 = bep121 %>% replace(., . == 29, NA)) %>%
  mutate(turnout13 = case_when(turnout13 == 28 ~ 0, turnout13 != 28 ~ 1)) %>%
  subset(., !is.na(turnout13)) %>% select(pid, syear, turnout13) %>% unique()

###

# TURNOUT CROSS-SECTION
# BAP: TURNOUT 2009
meta <- c('pid', 'syear')
vars <- c('bap126')
df_turnout09_cs <- read_dta(file = "raw/bap.dta", col_select = all_of(c(meta, vars))) %>%
  mutate_at(vars(vars), funs(ifelse(. < 0, NA, .))) %>%
  mutate(turnout09 = bap126 %>% replace(., . == 3, NA)) %>%
  mutate(turnout09 = as.numeric(turnout09 == 1)) %>%
  subset(., !is.na(turnout09)) %>% select(pid, turnout09_cs = turnout09) %>% unique()

# BIP: TURNOUT 2013
meta <- c('pid', 'syear')
vars <- c('bep121')
df_turnout13_cs <- read_dta(file = "raw/bep.dta", col_select = all_of(c(meta, vars))) %>%
  mutate_at(vars(vars), funs(ifelse(. < 0, NA, .))) %>%
  mutate(turnout13 = bep121 %>% replace(., . == 29, NA)) %>%
  mutate(turnout13 = case_when(turnout13 == 28 ~ 0, turnout13 != 28 ~ 1)) %>%
  subset(., !is.na(turnout13)) %>% select(pid, turnout13_cs = turnout13) %>% unique()

# BIP: TURNOUT 2017
meta <- c('pid', 'syear')
vars <- c('bip_175')
df_turnout17_cs <- read_dta(file = "raw/bip.dta", col_select = all_of(c(meta, vars))) %>%
  mutate_at(vars(vars), funs(ifelse(. < 0, NA, .))) %>%
  mutate(turnout17 = bip_175 %>% replace(., . == 29, NA)) %>%
  mutate(turnout17 = case_when(turnout17 == 28 ~ 0, turnout17 != 28 ~ 1)) %>%
  subset(., !is.na(turnout17)) %>% select(pid, turnout17_cs = turnout17) %>% unique()

df_turnout_cs <- df_turnout13_cs %>% left_join(., df_turnout09_cs, by = 'pid') %>% left_join(., df_turnout17_cs, by = 'pid')

################################################

# MERGE DATASETS
temp <-  df_pl %>%
  left_join(., df_pl_cs, by = c('pid')) %>%
  left_join(., df_turnout13, by = c('pid', 'syear')) %>%
  left_join(., df_turnout_cs, by = c('pid')) %>%
  left_join(., df_weight, by = c('pid')) %>%
  left_join(., df_biobirth, by = c('pid')) %>%
  left_join(., df_nchild, by = c('pid')) %>%
  left_join(., df_pgen, by = c('pid', 'syear')) %>%
  left_join(., df_demo, by = c('pid', 'syear')) %>%
  left_join(., df_hgen, by = c('hid', 'syear')) %>%
  left_join(., df_pl_hhmem, by = c('hid', 'syear')) %>%
  left_join(., df_pl_cs_otherOpinion, by = c('pid'))

################################################

# SUBSET DATASET
df <- temp %>%
  dplyr::select("pid", 'hid', "syear", 'weight1214', 'weight14', 
         'INTDATE', 'INTYEAR', 'INTMONTH', 'INTWEEK', "bioyear", 
         everything(.)) %>%
  arrange(pid, syear)

################################################

# STORE DATASET
fileName <- file.path(rep_path, "data/TEMPORARY.RData")
save(df, file = fileName)
